# Notebook-style setup: pin the working directory to the *deployed* copy of
# the project before importing its modules.
import os
os.chdir('/data/l989o/deployed/a')
import sys
# Drop the source checkout from sys.path so `import data` below resolves to
# the deployed copy in /data/l989o/deployed/a.
if '/data/l989o/a' in sys.path:
    sys.path.remove('/data/l989o/a')
import scanpy as sc
import anndata as ad
import numpy as np
import matplotlib.pyplot as plt
import os  # NOTE(review): duplicate of the `import os` above
import h5py
import pickle
import pandas as pd
from data import file_path
from tqdm.notebook import tqdm
from data import TransformedMeanDataset
# Count cells per split; `n` (train) is reused below to subset the merged
# AnnData objects, `v` (validation) only for the printed totals.
ds0 = TransformedMeanDataset('train')
n = sum(len(batch) for batch in ds0)
print(n)
ds1 = TransformedMeanDataset('validation')
v = sum(len(batch) for batch in ds1)
print(v)
print(n + v)
class Quantities:
    """Bag of per-method results, keyed by 'raw' / 'transformed' / 'vae_mu'."""

    def __init__(self):
        self.phenographs = {}             # cell-level phenograph labels
        self.adata = {}                   # AnnData with a 'phenograph' obs column
        self.num_phenograph_classes = {}  # count of distinct phenograph labels
        self.all_fractions = {}           # per-ome phenograph composition vectors
        self.omes = {}                    # ome filenames seen in the hdf5 files
q = Quantities()
# For each representation: attach phenograph labels to the UMAP AnnData and
# compute, per ome, the fraction of cells in each phenograph class.
for method in ['raw', 'transformed', 'vae_mu']:
    f = file_path(f'umap_{method}.adata')
    a = ad.read(f)
    print(a)
    # keep only the training cells (n counted above)
    a = a[:n].copy()
    print(a)
    index_info_omes, index_info_begins, index_info_ends = pickle.load(open(file_path('merged_cells_info.pickle'), 'rb'))
    print(index_info_ends[-1])
    l = []
    b = file_path(f'phenograph_{method}.hdf5')
    with h5py.File(b, 'r') as f:
        # concatenate the per-ome phenograph labels in merged-cells order
        for i, o in enumerate(index_info_omes):
            phenograph = f[o]['phenograph'][...].reshape((-1, 1))
            assert len(phenograph) == index_info_ends[i] - index_info_begins[i]
            l.append(phenograph)
    phenographs = np.concatenate(l, axis=0)
    phenographs.shape
    q.phenographs[method] = phenographs
    s = pd.Series(phenographs.flatten(), dtype='category')
    print(s)
    s.index = a.obs.index
    a.obs['phenograph'] = s
    display(a.obs)  # notebook-only builtin
    q.adata[method] = a
    # sc.pl.umap(a, color='phenograph')
    # assumes labels are contiguous 0..K-1 — verified by the assert below
    num_phenograph_classes = np.max(phenographs) + 1
    q.num_phenograph_classes[method] = num_phenograph_classes
    print(num_phenograph_classes)
    a = len(set(phenographs.flatten().tolist()))  # NOTE(review): clobbers the AnnData `a`
    assert num_phenograph_classes == a, a
    phenograph_fractions = dict()
    with h5py.File(b, 'r') as f:
        q.omes[method] = list(f.keys())
        for i, o in tqdm(enumerate(index_info_omes), desc='computing fractions'):
            phenograph = f[o]['phenograph'][...].reshape((-1, 1))
            assert len(phenograph) == index_info_ends[i] - index_info_begins[i]
            # normalized histogram of phenograph classes for this ome
            fractions = np.zeros((num_phenograph_classes))
            for p in phenograph.flatten():
                fractions[p] += 1
            fractions /= np.sum(fractions)
            phenograph_fractions[o] = fractions.reshape((1, -1))
    print(len(phenograph_fractions))
    all_fractions = np.concatenate(list(phenograph_fractions.values()), axis=0)
    print(all_fractions.shape)
    q.all_fractions[method] = all_fractions
q.fractions_u = dict()
q.dbscan_labels = dict()
# Embed the per-ome composition vectors with UMAP and cluster the 2-d
# embedding with DBSCAN, per method.
for method in ['raw', 'transformed', 'vae_mu']:
    print('-' * 100)
    print('method =', method)
    from umap import UMAP
    reducer = UMAP(2, verbose=True)
    u = reducer.fit_transform(q.all_fractions[method])
    q.fractions_u[method] = u
    from sklearn.cluster import DBSCAN
    # eps/min_samples hand-tuned; DBSCAN marks noise points with label -1
    clustering = DBSCAN(eps=1, min_samples=2).fit(u)
    print(clustering.labels_)
    print(clustering)
    q.dbscan_labels[method] = clustering.labels_
    print(type(clustering.labels_))
    print(clustering.labels_.shape)
# Plot the composition UMAP per method. The scatter is drawn fully
# transparent (alpha=0) only to set the axis limits; each point is instead
# rendered as its DBSCAN label text via annotate.
for method in ['raw', 'transformed', 'vae_mu']:
    labels = q.dbscan_labels[method]
    u = q.fractions_u[method]
    plt.figure(figsize=(9, 9))
    colors = np.random.random((1000, 3))[labels]
    plt.scatter(u[:, 0], u[:, 1], c=colors, alpha=0)
    ax = plt.gca()
    texts = list(map(str, labels))
    for i in range(len(u)):
        ax.annotate(texts[i], (u[i, 0], u[i, 1]), color=colors[i])
    # plt.text(u[:, 0], u[:, 1], texts, color=colors)
    plt.title(f'found {max(labels)} dbscan clusters')
    plt.show()
# shared jitter vector (fraction of the data range), reused by the next plot
jitter = (np.random.rand(len(u)) - 0.5) / 25
# Pick four clusters of interest and remember, for each, the ome indices
# that belong to it according to the 'raw' DBSCAN labels.
import numpy as np
random_colors = np.random.rand(4, 3)
hot_indices = [39, 4, 29, 22]
patients_per_hot_index = {
    h: np.where(q.dbscan_labels['raw'] == h)[0] for h in hot_indices
}
# Annotated UMAP plots again, highlighting the four 'hot' clusters; their
# membership was computed from the 'raw' labels and is applied to every
# method so the same omes are highlighted across plots.
for method in ['raw', 'transformed', 'vae_mu']:
    labels = q.dbscan_labels[method]
    u = q.fractions_u[method]
    plt.figure(figsize=(20, 20))
    colors = np.tile([0., 0., 0.], (len(labels), 1))
    hot_indices = [39, 4, 29, 22]
    for color, index in zip(random_colors, hot_indices):
        # matches = np.where(labels == index)
        # print(len(matches[0]))
        # colors[matches, :] = color
        colors[patients_per_hot_index[index], :] = color
    # jitter = (np.random.rand(len(u)) - 0.5) / 25
    # jitter scaled to the data range of each axis
    jitter_x = jitter * (np.max(u[:, 0]) - np.min(u[:, 0]))
    jitter_y = jitter * (np.max(u[:, 1]) - np.min(u[:, 1]))
    # both scatters are invisible (alpha=0) — they only fix the axis limits;
    # labels are drawn via annotate below
    plt.scatter(u[:, 0] + jitter_x, u[:, 1] + jitter_y, c=colors, alpha=0)
    plt.scatter(u[:, 0], u[:, 1], c=colors, alpha=0)
    ax = plt.gca()
    texts = list(map(str, labels))
    for i in range(len(u)):
        ax.annotate(texts[i], (u[i, 0], u[i, 1]), color=colors[i])
    # plt.text(u[:, 0], u[:, 1], texts, color=colors)
    plt.title(f'found {max(labels)} dbscan clusters')
    plt.show()
# Cluster-size distributions: per method, histogram of DBSCAN labels with
# the counts re-sorted in decreasing order, overlaid on one figure.
plt.figure()
methods = ['raw', 'transformed', 'vae_mu']
for method in methods:
    a = q.dbscan_labels[method]
    print(np.unique(a, return_counts=True))
    h, bin_edges = np.histogram(a, max(a) + 1)
    bin_centers = 0.5*(bin_edges[1:] + bin_edges[:-1])
    # descending sort of the counts (negate, sort ascending, negate back)
    h = [-a for a in sorted(-h)]
    plt.step(bin_centers, h, where='mid')
    # s = sorted(bins)
    # plt.bar(mids, s)
plt.show()
from sklearn.metrics import adjusted_rand_score
from operator import itemgetter


def get_m(method0, method1):
    """Compare two methods' DBSCAN clusterings.

    Prints their adjusted Rand score, draws diagnostic scatter plots, and
    returns the (n0 x n1) contingency matrix of label co-occurrences.
    The relabeling detour (clusters renamed by decreasing size) is a sanity
    check: a relabeled copy must have ARI == 1 with the original.
    """
    print('-' * 100)
    n0 = max(q.dbscan_labels[method0]) + 1
    n1 = max(q.dbscan_labels[method1]) + 1
    labels0 = q.dbscan_labels[method0]
    labels1 = q.dbscan_labels[method1]
    print('score:', adjusted_rand_score(labels0, labels1))
    plt.figure()
    plt.scatter(q.dbscan_labels[method0], q.dbscan_labels[method1])
    plt.show()
    # sort points by (cluster size desc, label asc) and keep the permutation
    counts0 = dict(zip(*np.unique(labels0, return_counts=True)))
    indices, sorted_labels0 = zip(*sorted(enumerate(labels0), key=lambda x: (-counts0[x[1]], x[1])))
    # print(sorted_labels0)
    # first-occurrence order of sorted labels -> size rank of each cluster
    seen = set()
    no_duplicates = [x for x in sorted_labels0 if not (x in seen or seen.add(x))]
    l = list(range(max(seen) + 1))
    assert len(l) == len(no_duplicates)
    my_map = zip(l, no_duplicates)
    relabeled0 = [no_duplicates.index(i) for i in sorted_labels0]
    # print(relabeled0)
    # relabeling must not change the clustering (ARI == 1)
    s = adjusted_rand_score(sorted_labels0, relabeled0)
    assert np.isclose(s, 1.), s
    sorted_labels1 = [labels1[i] for i in indices]
    # print(sorted_labels1)
    seen = set()
    no_duplicates = [x for x in sorted_labels1 if not (x in seen or seen.add(x))]
    l = list(range(max(seen) + 1))
    assert len(l) == len(no_duplicates)
    my_map = zip(l, no_duplicates)
    relabeled1 = [no_duplicates.index(i) for i in sorted_labels1]
    # print(relabeled0)
    s = adjusted_rand_score(sorted_labels1, relabeled1)
    assert np.isclose(s, 1.), s
    # counts1 = dict(zip(*np.unique(labels1, return_counts=True)))
    # sorted_labels1 = sorted(labels1, key=lambda x: (-counts1[x], x))
    plt.figure()
    print(list(zip(relabeled0, sorted_labels0)))
    print(list(zip(relabeled1, sorted_labels1)))
    plt.scatter(relabeled0, sorted_labels0)
    plt.show()
    # contingency matrix of the *original* (unrelabeled) labels
    m = np.zeros((n0, n1))
    for a, b in zip(labels0, labels1):
        # for a, b in zip(relabeled0, relabeled1):
        m[a, b] += 1
    m_rows = m.copy()
    m_cols = m.copy()
    # NOTE(review): 'rows' normalizes along axis 0 and 'cols' along axis 1 —
    # the names look swapped; confirm intent
    m_rows = m_rows / np.sum(m_rows, axis=0, keepdims=True)
    m_cols = m_cols / np.sum(m_cols, axis=1, keepdims=True)
    plt.figure()
    plt.imshow(m, cmap='inferno')
    plt.colorbar()
    plt.show()
    plt.figure()
    plt.imshow(m_rows)
    plt.colorbar()
    plt.show()
    plt.figure()
    plt.imshow(m_cols)
    plt.colorbar()
    plt.show()
    return m


get_m('raw', 'transformed')
get_m('transformed', 'vae_mu')
get_m('vae_mu', 'raw')
from metadata import get_metadata
# Per-ome clinical metadata; cohort encoded as 0 = 'basel', 1 = anything else.
df = get_metadata()
display(df)  # notebook-only builtin
print(df['Subtype'].value_counts())
cohort_info = df.loc[df['FileName_FullStack'].isin(q.omes['raw']), 'cohort'].apply(lambda x: 0 if x == 'basel' else 1).to_numpy()
def f(x):
    """Encode a tumor 'Subtype' label as an integer; NaN maps to 4.

    Raises AssertionError for any value that is neither NaN nor one of the
    four known PR/ER combinations.
    """
    codes = {'PR+ER+': 0, 'PR-ER-': 1, 'PR-ER+': 2, 'PR+ER-': 3}
    # keep the exact original NaN test (`type(x) == float`), which rejects
    # non-float NaN-likes just as the original did
    is_nan = type(x) == float and np.isnan(x)
    assert is_nan or x in codes, x
    if is_nan:
        return 4
    return codes[x]
# Encode the tumor subtype per ome, then plot the composition UMAP with
# each DBSCAN label text colored by cohort (red/blue).
tumor_info = df.loc[df['FileName_FullStack'].isin(q.omes['raw']), 'Subtype'].apply(f).to_numpy()
print(cohort_info.shape)
print(tumor_info.shape)
random_colors = np.random.rand(4, 3)
for method in ['raw', 'transformed', 'vae_mu']:
    labels = q.dbscan_labels[method]
    u = q.fractions_u[method]
    plt.figure(figsize=(9, 9))
    colors = np.tile([0., 0., 0.], (len(labels), 1))
    two_colors = np.array([[1., 0., 0.], [0., 0., 1.]])
    hot_indices = [39, 4, 29, 22]
    for color, index in zip(random_colors, hot_indices):
        matches = np.where(labels == index)
        print(len(matches[0]))
        colors[matches, :] = color
    # invisible scatter (alpha=0) fixes the limits; text drawn via annotate
    plt.scatter(u[:, 0], u[:, 1], c=cohort_info, alpha=0)
    ax = plt.gca()
    texts = list(map(str, labels))
    cohort_colors = two_colors[cohort_info, :]
    for i in range(len(u)):
        ax.annotate(texts[i], (u[i, 0], u[i, 1]), color=cohort_colors[i, :])
    # plt.text(u[:, 0], u[:, 1], texts, color=colors)
    plt.title(f'found {max(labels)} dbscan clusters')
    plt.show()
# Same plot, label text colored by tumor subtype (5 classes incl. NaN),
# with a proxy-artist legend for the subtype colors.
random_colors = np.random.rand(5, 3)
for method in ['raw', 'transformed', 'vae_mu']:
    labels = q.dbscan_labels[method]
    u = q.fractions_u[method]
    plt.figure(figsize=(9, 9))
    colors = np.tile([0., 0., 0.], (len(labels), 1))
    hot_indices = [39, 4, 29, 22]
    for color, index in zip(random_colors, hot_indices):
        matches = np.where(labels == index)
        print(len(matches[0]))
        colors[matches, :] = color
    plt.scatter(u[:, 0], u[:, 1], alpha=0)
    ax = plt.gca()
    texts = list(map(str, labels))
    cohort_colors = random_colors[tumor_info, :]
    for i in range(len(u)):
        ax.annotate(texts[i], (u[i, 0], u[i, 1]), color=cohort_colors[i, :])
    # plt.text(u[:, 0], u[:, 1], texts, color=colors)
    plt.title(f'found {max(labels)} dbscan clusters')
    from matplotlib.lines import Line2D
    # proxy artists: one colored line per subtype class for the legend
    custom_lines = [Line2D([0], [0], color=random_colors[i], lw=4) for i in range(5)]
    ax.legend(custom_lines, ['PR+ER+', 'PR-ER-', 'PR-ER+', 'PR+ER-', 'nan'])
    plt.show()
df  # bare expression: notebook cell display
# Same plot, label text colored by patient of origin ('merged_pid').
patient_of_origin = df.loc[df['FileName_FullStack'].isin(q.omes['raw']), 'merged_pid'].to_numpy()
patient_of_origin  # notebook display
random_colors = np.random.rand(1000, 3)
for method in ['raw', 'transformed', 'vae_mu']:
    labels = q.dbscan_labels[method]
    u = q.fractions_u[method]
    plt.figure(figsize=(9, 9))
    plt.scatter(u[:, 0], u[:, 1], alpha=0)
    ax = plt.gca()
    texts = list(map(str, labels))
    cohort_colors = random_colors[patient_of_origin, :]
    for i in range(len(u)):
        ax.annotate(texts[i], (u[i, 0], u[i, 1]), color=cohort_colors[i, :])
    # plt.text(u[:, 0], u[:, 1], texts, color=colors)
    plt.title(f'found {max(labels)} dbscan clusters')
    plt.show()
from sklearn.metrics import adjusted_rand_score
from operator import itemgetter


def get_m_patients(method0):
    """Compare a method's DBSCAN clustering against the patient of origin.

    Prints the adjusted Rand score and draws diagnostic scatter plots.
    NOTE(review): the bare `return` below exits before the contingency-matrix
    code; everything after it is currently dead (kept, presumably, for reuse).
    """
    print('-' * 100)
    n0 = max(q.dbscan_labels[method0]) + 1
    labels0 = q.dbscan_labels[method0]
    print('score:', adjusted_rand_score(labels0, patient_of_origin))
    plt.figure()
    plt.scatter(q.dbscan_labels[method0], patient_of_origin, c=patient_of_origin)
    plt.show()
    # sort points by (cluster size desc, label asc) and keep the permutation
    counts0 = dict(zip(*np.unique(labels0, return_counts=True)))
    indices, sorted_labels0 = zip(*sorted(enumerate(labels0), key=lambda x: (-counts0[x[1]], x[1])))
    # print(sorted_labels0)
    seen = set()
    no_duplicates = [x for x in sorted_labels0 if not (x in seen or seen.add(x))]
    l = list(range(max(seen) + 1))
    assert len(l) == len(no_duplicates)
    my_map = zip(l, no_duplicates)
    relabeled0 = [no_duplicates.index(i) for i in sorted_labels0]
    # print(relabeled0)
    # rank-relabeling must preserve the clustering (ARI == 1)
    s = adjusted_rand_score(sorted_labels0, relabeled0)
    assert np.isclose(s, 1.), s
    # print(labels0)
    # print(sorted_labels0)
    # print(relabeled0)
    print(indices)
    plt.figure()
    plt.scatter(np.arange(len(indices)), indices, c=cohort_info)
    ax = plt.gca()
    ax.set_aspect('equal')
    plt.show()
    sorted_patient_of_origin = [patient_of_origin[i] for i in indices]
    # remap patient ids to a dense 0..P-1 range
    make_indices_adjacent = dict(zip(sorted(list(set(sorted_patient_of_origin))), list(range(len(set(sorted_patient_of_origin))))))
    plt.figure()
    plt.scatter(relabeled0, [make_indices_adjacent[s] for s in sorted_patient_of_origin], c=[cohort_info[i] for i in indices])
    plt.show()
    return
    # --- unreachable from here on (see NOTE in the docstring) ---
    m = np.zeros((n0, len(make_indices_adjacent)))
    for a, b in zip(relabeled0, sorted_patient_of_origin):
        # for a, b in zip(relabeled0, relabeled1):
        m[a, make_indices_adjacent[b]] += 1
    m_rows = m.copy()
    m_cols = m.copy()
    m_rows = m_rows / np.sum(m_rows, axis=0, keepdims=True)
    m_cols = m_cols / np.sum(m_cols, axis=1, keepdims=True)
    plt.figure(figsize=(20, 9))
    plt.imshow(m, cmap='inferno')
    plt.colorbar()
    plt.show()
    plt.figure(figsize=(20, 9))
    plt.imshow(m_rows)
    plt.colorbar()
    plt.show()
    plt.figure(figsize=(20, 9))
    plt.imshow(m_cols)
    plt.colorbar()
    plt.show()
    return m


get_m_patients('raw')
# get_m_patients('transformed')
# get_m_patients('vae_mu')
# Inline, exploratory re-run of get_m_patients for method0 = 'raw' that
# continues past the early return: builds the cluster x patient contingency
# matrix and shows raw / axis-0- / axis-1-normalized versions.
from sklearn.metrics import adjusted_rand_score
from operator import itemgetter
method0 = 'raw'
print('-' * 100)
n0 = max(q.dbscan_labels[method0]) + 1
labels0 = q.dbscan_labels[method0]
print('score:', adjusted_rand_score(labels0, patient_of_origin))
plt.figure()
plt.scatter(q.dbscan_labels[method0], patient_of_origin, c=patient_of_origin)
plt.show()
# sort points by (cluster size desc, label asc) and keep the permutation
counts0 = dict(zip(*np.unique(labels0, return_counts=True)))
indices, sorted_labels0 = zip(*sorted(enumerate(labels0), key=lambda x: (-counts0[x[1]], x[1])))
print(len(sorted_labels0))
print(len(cohort_info))
print(len(patient_of_origin))
# remap patient ids to a dense 0..P-1 range
make_indices_adjacent = dict(zip(sorted(list(set(patient_of_origin))), list(range(len(set(patient_of_origin))))))
print(len(make_indices_adjacent))
patient_of_origin_adjacent = [make_indices_adjacent[p] for p in patient_of_origin]
print(len(patient_of_origin_adjacent))
plt.figure()
plt.scatter(patient_of_origin, patient_of_origin_adjacent)
plt.show()
plt.figure()
plt.scatter(q.dbscan_labels[method0], patient_of_origin_adjacent, c=cohort_info)
plt.show()
sorted_patient_of_origin_adjacent = [patient_of_origin_adjacent[i] for i in indices]
sorted_cohort_info = [cohort_info[i] for i in indices]
print(len(sorted_patient_of_origin_adjacent))
print(len(sorted_cohort_info))
plt.figure()
plt.scatter(np.arange(len(sorted_patient_of_origin_adjacent)), sorted_patient_of_origin_adjacent, c=sorted_cohort_info)
plt.show()
# rank-relabel clusters by size; sanity check with ARI == 1
seen = set()
no_duplicates = [x for x in sorted_labels0 if not (x in seen or seen.add(x))]
l = list(range(max(seen) + 1))
assert len(l) == len(no_duplicates)
my_map = zip(l, no_duplicates)
relabeled0 = [no_duplicates.index(i) for i in sorted_labels0]
# print(relabeled0)
s = adjusted_rand_score(sorted_labels0, relabeled0)
assert np.isclose(s, 1.), s
# print(labels0)
# print(sorted_labels0)
# print(relabeled0)
plt.figure()
plt.scatter(np.arange(len(indices)), indices, c=cohort_info)
ax = plt.gca()
ax.set_aspect('equal')
plt.show()
plt.figure()
plt.scatter(relabeled0, sorted_patient_of_origin_adjacent, c=sorted_cohort_info)
plt.show()
# contingency matrix: DBSCAN cluster (size rank) x patient (dense id)
m = np.zeros((n0, len(make_indices_adjacent)))
for a, b in zip(relabeled0, sorted_patient_of_origin_adjacent):
    # for a, b in zip(relabeled0, relabeled1):
    m[a, b] += 1
m_rows = m.copy()
m_cols = m.copy()
# NOTE(review): 'rows' normalizes along axis 0 and 'cols' along axis 1 —
# the names look swapped; confirm intent
m_rows = m_rows / np.sum(m_rows, axis=0, keepdims=True)
m_cols = m_cols / np.sum(m_cols, axis=1, keepdims=True)
plt.figure(figsize=(20, 9))
plt.imshow(m, cmap='inferno')
plt.colorbar()
plt.show()
plt.figure(figsize=(20, 9))
plt.imshow(m_rows)
plt.colorbar()
plt.show()
plt.figure(figsize=(20, 9))
plt.imshow(m_cols)
plt.colorbar()
plt.show()
df  # notebook display of the metadata table
import torch
def get_filtered_labels_mapping(ome_index, ome_filename, split):
    """Map filtered-cell row positions to original cell labels for one ome.

    Reads the 'ok cells' bookkeeping for *split* and, for the contiguous run
    of entries belonging to *ome_filename*, returns a dict mapping each kept
    cell's positional index (0..n_ok-1, i.e. its row in the filtered
    per-ome matrices) to its original cell id / mask label.
    """
    import pickle
    # NOTE(review): the file has a .npy extension but is read with pickle —
    # confirm it really is a pickled dict rather than an np.save artifact
    f = os.path.join('data/spatial_uzh_processed/a', f'ok_cells_{split}.npy')
    # FIX: use a context manager so the file handle is always closed
    with open(f, 'rb') as fh:
        d = pickle.load(fh)
    list_of_ome_filenames = d['list_of_ome_filenames']
    list_of_cell_ids = d['list_of_cell_ids']
    cell_is_ok = d['cell_is_ok']
    # entries for one ome are contiguous: locate its first and last index
    begin = list_of_ome_filenames.index(ome_filename)
    end = len(list_of_ome_filenames) - list_of_ome_filenames[::-1].index(ome_filename)
    oks = cell_is_ok[begin: end]
    # positional indices of the surviving cells, paired with their ids
    l0 = np.array(list(range(np.sum(oks).item())))
    l1 = list_of_cell_ids[ome_index][oks]
    assert len(l0) == len(l1)
    d = dict(zip(l0, l1))
    return d
# STEP 1: get all the expression values
from data import RawMeanDataset, TransformedMeanDataset, MasksDataset
index_info_omes, index_info_begins, index_info_ends = pickle.load(open(file_path('merged_cells_info.pickle'), 'rb'))
# expressions[method]: one (n_cells, n_features) matrix per representation
expressions = dict()
for method in methods:
    if method == 'raw':
        ds = RawMeanDataset('train')
        l = []
        for x in tqdm(ds, 'merging raw expresions'):
            l.append(x.numpy())
        expressions[method] = np.concatenate(l, axis=0)
    elif method == 'transformed':
        ds = TransformedMeanDataset('train')
        l = []
        for x in tqdm(ds, 'merging transformed expresions'):
            l.append(x.numpy())
        expressions[method] = np.concatenate(l, axis=0)
    elif method == 'vae_mu':
        f = os.path.join(file_path('vae_transformed_mean_dataset_LR_VB_S_0.0014685885989200848__3.8608662714605464e-08__False'), 'embedding_train.hdf5')
        with h5py.File(f, 'r') as f5:
            assert len(f5.keys()) == 1
            # the single top-level group holds per-ome 'mu'/'log_var' datasets
            k, v = f5.items().__iter__().__next__()
            raw_ds = RawMeanDataset('train')
            o_train = raw_ds.filenames
            original_list = []
            mu_list = []
            log_var_list = []
            for i, o in enumerate(tqdm(o_train, desc='accessing latent space')):
                original = raw_ds[i].clone().detach().numpy()
                mu = v[o]['mu'][...]
                log_var = v[o]['log_var'][...]
                assert mu.shape == log_var.shape
                assert len(original) == len(mu)
                original_list.append(original)
                mu_list.append(mu)
                log_var_list.append(log_var)
            # BUG FIX: this previously concatenated `l`, the leftover list
            # from the 'transformed' branch, instead of the mu embeddings
            expressions[method] = np.concatenate(mu_list, axis=0)
    else:
        raise ValueError()
# STEP 2: compute the PCA, affine transform into [0, 1]
from sklearn.decomposition import PCA
pca = dict()
for method in methods:
    # 3 components so each cell can later be mapped to an RGB triple
    reducer = PCA(n_components=3)
    projected = reducer.fit_transform(expressions[method])
    lo = projected.min(axis=0)
    hi = projected.max(axis=0)
    # min-max rescale each component independently to [0, 1]
    pca[method] = (projected - lo) / (hi - lo)
# STEP 4: in the loop below replace the mess with selecting from the PCA of everything
# STEP 5: fix the plotting function to plot the right things
# Rank each method's DBSCAN clusters by size (largest first).
from pprint import pprint
q.top = dict()
for method in methods:
    labels = q.dbscan_labels[method]
    counts = dict(zip(*np.unique(labels, return_counts=True)))
    indices, sorted_labels = zip(*sorted(enumerate(labels), key=lambda x: (-counts[x[1]], x[1])))
    # print(sorted_labels)
    # q.top[method][0] is the label of the largest cluster
    top_full = sorted(zip(*np.unique(labels, return_counts=True)), key=lambda x: -x[1])
    top = [t[0] for t in top_full]
    q.top[method] = top
    print(top[:3])
import math
masks_ds_train = MasksDataset('train')
# For each of the 8 largest 'vae_mu' DBSCAN clusters, render every member
# ome's segmentation mask with each cell colored by its globally-computed
# 3-dim PCA of cell-level expression, mapped into RGB.
# for method in methods:
method = 'vae_mu'
if True:
    for h in q.top[method][:8]:
        patients_per_hot_index = np.where(q.dbscan_labels[method] == h)[0]
        print(patients_per_hot_index)
        print('=' * 100)
        print(f'methods = {method}, hot_index = {h}')
        l = len(patients_per_hot_index)
        d = 4  # subplot edge length in inches (rebound to a dict below)
        cols = 5
        rows = math.ceil(l / cols)
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols * d, rows * d))
        axes = axes.flatten()
        for i, ome_index in enumerate(tqdm(patients_per_hot_index, desc='plotting omes')):
            # print(f'ome_index = {ome_index}')
            begin = index_info_begins[ome_index]
            end = index_info_ends[ome_index]
            ome_filename = masks_ds_train.filenames[ome_index]
            masks = masks_ds_train[ome_index]
            d = get_filtered_labels_mapping(ome_index, ome_filename, 'train')
            new_masks = np.ones((masks.shape[0], masks.shape[1], 4))
            # filtered-out mask labels (and the background, 0) are painted black
            # NOTE(review): range(np.max(masks)) excludes the max label —
            # confirm whether it should be np.max(masks) + 1
            omitted_labels = set(list(range(np.max(masks)))).difference([dd.item() for dd in d.values()]).difference({0})
            for l in omitted_labels:
                new_masks[masks == l, :] = (0., 0., 0., 1.)
            new_masks[masks == 0, :] = (0., 0., 0., 1.)
            # kk = np.array([k for k in d.keys()])
            # vv = np.array([v.item() for v in d.values()])
            # # not working, but I should use of this kind
            # new_masks[masks == vv, :3] = pca[method][begin: end][kk, :]
            for k, v in d.items():
                new_masks[masks == v.item(), :3] = pca[method][begin: end][k, :]
            axes[i].imshow(new_masks)
            axes[i].set_title(f'ome_index = {ome_index}')
        # BUG FIX: start at i + 1 (was i, which also blanked the last
        # populated subplot); matches the later copies of this loop
        for j in range(i + 1, rows * cols):
            axes[j].axis('off')
        plt.suptitle(f'3-dim pca (computed globally) of transformed cell-level data, mapped into the RGB space')
        plt.tight_layout()
        plt.show()
import math
masks_ds_train = MasksDataset('train')
# Same per-cluster mask rendering as the PCA figure above, but each cell is
# colored by its phenograph label through a random palette.
# for method in methods:
method = 'vae_mu'
if True:
    random_colors = np.random.rand(1000, 3)
    for h in q.top[method][:8]:
        patients_per_hot_index = np.where(q.dbscan_labels[method] == h)[0]
        print(patients_per_hot_index)
        print('=' * 100)
        print(f'methods = {method}, hot_index = {h}')
        l = len(patients_per_hot_index)
        d = 4  # subplot edge length in inches (rebound to a dict below)
        cols = 5
        rows = math.ceil(l / cols)
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols * d, rows * d))
        axes = axes.flatten()
        for i, ome_index in enumerate(tqdm(patients_per_hot_index, desc='plotting omes')):
            # print(f'ome_index = {ome_index}')
            begin = index_info_begins[ome_index]
            end = index_info_ends[ome_index]
            ome_filename = masks_ds_train.filenames[ome_index]
            masks = masks_ds_train[ome_index]
            d = get_filtered_labels_mapping(ome_index, ome_filename, 'train')
            new_masks = np.ones((masks.shape[0], masks.shape[1], 4))
            # filtered-out mask labels (and the background, 0) are painted black
            omitted_labels = set(list(range(np.max(masks)))).difference([dd.item() for dd in d.values()]).difference({0})
            for l in omitted_labels:
                new_masks[masks == l, :] = (0., 0., 0., 1.)
            new_masks[masks == 0, :] = (0., 0., 0., 1.)
            # kk = np.array([k for k in d.keys()])
            # vv = np.array([v.item() for v in d.values()])
            # # not working, but I should use of this kind
            # new_masks[masks == vv, :3] = pca[method][begin: end][kk, :]
            for k, v in d.items():
                new_masks[masks == v.item(), :3] = random_colors[q.phenographs[method][begin: end][k, :]]
            axes[i].imshow(new_masks)
            axes[i].set_title(f'ome_index = {ome_index}')
        for j in range(i + 1, rows * cols):
            axes[j].axis('off')
        # BUG FIX: the suptitle was copied verbatim from the PCA figure above
        # and described the wrong quantity
        plt.suptitle(f'phenograph label of each cell, mapped to random colors')
        plt.tight_layout()
        plt.show()
import math
masks_ds_train = MasksDataset('train')
# Bar charts of each member ome's phenograph-composition vector, for the 8
# largest 'vae_mu' clusters; bars share the random per-class palette.
# for method in methods:
method = 'vae_mu'
if True:
    for h in q.top[method][:8]:
        patients_per_hot_index = np.where(q.dbscan_labels[method] == h)[0]
        print(patients_per_hot_index)
        print('=' * 100)
        print(f'methods = {method}, hot_index = {h}')
        l = len(patients_per_hot_index)
        d = 4  # subplot edge length in inches
        cols = 5
        rows = math.ceil(l / cols)
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols * d, rows * d))
        axes = axes.flatten()
        for i, ome_index in enumerate(tqdm(patients_per_hot_index, desc='plotting omes')):
            f = q.all_fractions[method][ome_index, :]
            axes[i].bar(np.arange(len(f)), f, color=[random_colors[j] for j in np.arange(len(f))])
            axes[i].set_title(f'ome_index = {ome_index}')
        for j in range(i + 1, rows * cols):
            axes[j].axis('off')
        # NOTE(review): suptitle copied from the PCA figure — it does not
        # describe this bar chart
        plt.suptitle(f'3-dim pca (computed globally) of transformed cell-level data, mapped into the RGB space')
        plt.tight_layout()
        plt.show()
# Hierarchical clustering of the transposed fraction matrix, then of its
# transpose, reordering rows by flat-cluster id to visualize block structure.
m = q.all_fractions['vae_mu'].T
plt.matshow(m)
import scipy
import scipy.cluster.hierarchy as sch
# vector of pairwise distances
d = sch.distance.pdist(m)
L = sch.linkage(d, method='complete')
# flat clusters, cut at half the maximum pairwise distance
ind = sch.fcluster(L, 0.5*d.max(), 'distance')
ind  # notebook display
# permutation that sorts rows by cluster id
ii, cc = zip(*sorted(zip(range(len(ind)), ind), key=lambda x: x[1]))
ii = np.array(ii)
plt.matshow(m[ii, :])
# repeat on the transpose
m = m.T
d = sch.distance.pdist(m)
L = sch.linkage(d, method='complete')
ind = sch.fcluster(L, 0.5*d.max(), 'distance')
ind  # notebook display
ii, cc = zip(*sorted(zip(range(len(ind)), ind), key=lambda x: x[1]))
ii = np.array(ii)
plt.matshow(m[ii, :].T)
# Seaborn clustermap of the fraction matrix; then check whether the row
# order it chose separates the two cohorts.
import seaborn as sns
df = pd.DataFrame(m)  # NOTE(review): clobbers the metadata DataFrame `df`
a = sns.clustermap(df)
b = a.data2d.index.to_numpy()  # row order selected by the clustermap
i = np.arange(len(b))
plt.plot(i, b)
plt.show()
plt.plot(i, cohort_info[b[i]], '-o')
plt.show()